# -------------------------------------------------------------------
# Purpose: Creates Table B4
# Author:  Max Posch, 25/07/2025
# Usage:   Source this script to generate the table.
# -------------------------------------------------------------------
# Check that required paths exist
stopifnot(dir.exists(pdataanalysis))
stopifnot(dir.exists(poutputappendix))


# Load entropy and population data
load(file.path(pdataanalysis, "countyLevel19001940.RData"))
d <- copy(countyLevel19001940)[year %in% seq(1900, 1940, 10), .(gisjoin_1900, year, entropy_namelast_mp_adjp)][order(year, -entropy_namelast_mp_adjp)]
pop <- copy(countyLevel19001940)[year==1900, .(gisjoin_1900, pop = sum_n_namelast_mp_adjp)]
pop[, pop := pop * 1000]


# Load immigrant share
imm <- copy(countyLevel19001940)[year == 1900, .(gisjoin_1900, immig = share_foreign_born_adjp)]
imm[, immig := immig * 100]


## counties with highest immigration share 1900 
county_mapping <- data.frame(
    gisjoin_1900 = c("G3800470", "G3800890", "G4804790", "G3800670", "G2700690", "G2600530", "G2600330", "G3800190", "G2600950", "G2700750"),
    county_name = c("Logan, ND", "Stark, ND", "Webb, TX", "Pembina, ND", "Kittson, MN", "Gogebic, MI", "Chippewa, MI", "Cavalier, ND", "Luce, MI", "Lake, MN")
)
tab1 <- d %>%
    filter(gisjoin_1900 %in% county_mapping$gisjoin_1900) %>%
    pivot_wider(names_from = year, values_from = entropy_namelast_mp_adjp, names_prefix = "year_", id_cols = gisjoin_1900) %>%
    inner_join(imm) %>%
    inner_join(pop) %>%
    select(gisjoin_1900, pop, immig, everything()) %>%
    arrange(desc(immig))

tab1_with_county <- tab1 %>%
    left_join(county_mapping, by = "gisjoin_1900")
tab1_with_county <- select(tab1_with_county, county_name, pop, immig, everything(), -gisjoin_1900)
tab1_with_county$pop <- format(round(tab1_with_county$pop), big.mark = ",")
tab1_with_county$immig <- sprintf("%.1f%%", tab1_with_county$immig)
tab1_xtable <- xtable(tab1_with_county)
tablename <- file.path(poutputappendix, "tableB04high.tex")
print(tab1_xtable,
    include.rownames = FALSE,
    file = tablename
)
get_clustering_coefs_rows(tablename)


## counties with lowest immigration share 1900  --------------------------------------------------------------
county_mapping <- data.frame(
    gisjoin_1900 = c("G1301450", "G4700870", "G2101150", "G1301490", "G1301770", "G1301190", "G2101930", "G3700050", "G2101890", "G1300070"),
    county_name = c("Harris, GA", "Jackson, TN", "Johnson, KY", "Heard, GA", "Lee, GA", "Franklin, GA", "Perry, KY", "Alleghany, NC", "Owsley, KY", "Baker, GA")
)
tab2 <- d %>%
    filter(gisjoin_1900 %in% county_mapping$gisjoin_1900) %>%
    pivot_wider(names_from = year, values_from = entropy_namelast_mp_adjp, names_prefix = "year_", id_cols = gisjoin_1900) %>%
    inner_join(imm) %>%
    inner_join(pop) %>%
    select(gisjoin_1900, pop, immig, everything()) %>%
    arrange(immig)

tab2_with_county <- tab2 %>%
    left_join(county_mapping, by = "gisjoin_1900")
tab2_with_county <- select(tab2_with_county, county_name, pop, immig, everything(), -gisjoin_1900)
tab2_with_county$pop <- format(round(tab2_with_county$pop), big.mark = ",")
tab2_with_county$immig <- sprintf("%.3f%%", tab2_with_county$immig)
tab2_xtable <- xtable(tab2_with_county)
tablename <- file.path(poutputappendix, "tableB04low.tex")
print(tab2_xtable,
    include.rownames = FALSE,
    file = tablename
)
get_clustering_coefs_rows(tablename)
